The Institute of Mixture Modeling for Equity-Oriented Researchers, Scholars, and Educators (IMMERSE) is an IES funded training grant (R305B220021) to support education scholars in integrating mixture modeling into their research.
Please visit our website to learn more.
Visit our GitHub account to access all the IMMERSE materials.
Follow us on Twitter!
How to reference this workshop: Institute of Mixture Modeling for Equity-Oriented Researchers, Scholars, and Educators (2024). IMMERSE In-Person Training Workshop (IES No. 305B220021). Institute of Education Sciences. https://immerse-ucsb.github.io/cohort-two/pre-training
Load packages
library(MplusAutomation)
library(tidyverse) #collection of R packages designed for data science
library(here) #helps with filepaths
library(janitor) #clean_names
library(gt) # create tables
library(cowplot) # a ggplot theme
library(DiagrammeR) # create path diagrams
library(glue)
here::i_am("lab3_enumeration.Rmd")
Data source: The dataset used in this example comes from the Longitudinal Study of American Life (LSAL; Miller, 2010), funded by the National Science Foundation (NSF) in 1986. Science attitude items for the LCA were selected based on prior research using the same data (Ing & Nylund-Gibson, 2013, 2017) to create the latent class variable.
| Binary LCA indicators1 | ||
| Name | Description | Values |
|---|---|---|
| Enjoy | I Enjoy Science | 0 = Disagree, 1 = Agree |
| Useful | Science is Useful in Everyday Problems | 0 = Disagree, 1 = Agree |
| Logical | Science Helps Logical Thinking | 0 = Disagree, 1 = Agree |
| Job | Need Science for a Good Job | 0 = Disagree, 1 = Agree |
| Adult | Will Use Science Often as an Adult | 0 = Disagree, 1 = Agree |
| 1 Longitudinal Study of American Life (LSAL) | ||
The data can be found in the data folder and is called
lsay_subset.csv.
lsay_data <- read_csv(here("data","lsay_subset.csv")) %>%
clean_names() # make variable names lowercase
Proportion of indicators using R:
# Set up data to find proportions of binary indicators
ds <- lsay_data %>%
pivot_longer(c(enjoy, good, useful, logical, job, adult), names_to = "Variable")
# Create table of variables and counts
tab <- table(ds$Variable, ds$value)
# Find proportions and round to 3 decimal places
prop <- prop.table(tab, margin = 1) %>%
round(3)
# Combine everything to one table
dframe <- data.frame(Variables=rownames(tab), Proportion=prop[,2], Count=tab[,2])
#remove row names
row.names(dframe) <- NULL
# Make it a gt() table
prop_table <- dframe %>%
gt()
prop_table
| Variables | Proportion | Count |
|---|---|---|
| adult | 0.702 | 1858 |
| enjoy | 0.669 | 1784 |
| good | 0.693 | 1850 |
| job | 0.743 | 1947 |
| logical | 0.640 | 1686 |
| useful | 0.695 | 1835 |
# Save as a Word doc
gtsave(prop_table, here("figures", "prop_table.docx"))
This code uses the mplusObject function in the
MplusAutomation package and saves all model runs in the
enum folder.
lca_6 <- lapply(1:6, function(k) {
lca_enum <- mplusObject(
TITLE = glue("{k}-Class"),
VARIABLE = glue(
"categorical = enjoy useful logical job adult;
usevar = enjoy useful logical job adult;
classes = c({k});"),
ANALYSIS =
"estimator = mlr;
type = mixture;
starts = 500 100;",
OUTPUT = "tech11 tech14 svalues;",
usevariables = colnames(lsay_data),
rdata = lsay_data)
lca_enum_fit <- mplusModeler(lca_enum,
dataout=glue(here("enum", "LSAL_data.dat")),
modelout=glue(here("enum", "c{k}_lsal.inp")) ,
check=TRUE, run = TRUE, hashfilename = FALSE)
})
First, extract data:
output_lsal <- readModels(here("enum"))
enum_extract <- LatexSummaryTable(
output_lsal,
keepCols = c(
"Title",
"Parameters",
"LL",
"BIC",
"aBIC",
"BLRT_PValue",
"T11_VLMR_PValue",
"Observations"
),
sortBy = "Title"
) #%>% slice_head(n=4) # Select first four models (Class 1 through 4)
allFit <- enum_extract %>%
mutate(CAIC = -2 * LL + Parameters * (log(Observations) + 1)) %>%
mutate(AWE = -2 * LL + 2 * Parameters * (log(Observations) + 1.5)) %>%
mutate(SIC = -.5 * BIC) %>%
mutate(expSIC = exp(SIC - max(SIC))) %>%
mutate(BF = exp(SIC - lead(SIC))) %>%
mutate(cmPk = expSIC / sum(expSIC)) %>%
dplyr::select(1:5, 9:10, 6:7, 13, 14) %>%
arrange(Parameters)
Then, create table:
fit_table <- allFit %>%
gt() %>%
tab_header(title = md("**Model Fit Summary Table**")) %>%
cols_label(
Title = "Classes",
Parameters = md("Par"),
LL = md("*LL*"),
T11_VLMR_PValue = "VLMR",
BLRT_PValue = "BLRT",
BF = md("BF"),
cmPk = md("*cmPk*")
) %>%
tab_footnote(
footnote = md(
"*Note.* Par = Parameters; *LL* = model log likelihood;
BIC = Bayesian information criterion;
aBIC = sample size adjusted BIC; CAIC = consistent Akaike information criterion;
AWE = approximate weight of evidence criterion;
BLRT = bootstrapped likelihood ratio test p-value;
VLMR = Vuong-Lo-Mendell-Rubin adjusted likelihood ratio test p-value;
*cmPk* = approximate correct model probability."
),
locations = cells_title()
) %>%
tab_options(column_labels.font.weight = "bold") %>%
fmt_number(c(3:7),
decimals = 2) %>%
sub_missing(1:11,
missing_text = "--") %>%
fmt(
c(8:9, 11),
fns = function(x)
ifelse(x < 0.001, "<.001",
scales::number(x, accuracy = .01))
) %>%
fmt(
10,
fns = function (x)
ifelse(x > 100, ">100",
scales::number(x, accuracy = .01))
) %>%
tab_style(
style = list(
cell_text(weight = "bold")
),
locations = list(cells_body(
columns = BIC,
row = BIC == min(BIC[1:6]) # Change this to the number of classes you estimated
),
cells_body(
columns = aBIC,
row = aBIC == min(aBIC[1:6])
),
cells_body(
columns = CAIC,
row = CAIC == min(CAIC[1:6])
),
cells_body(
columns = AWE,
row = AWE == min(AWE[1:6])
),
cells_body(
columns = cmPk,
row = cmPk == max(cmPk[1:6])
),
cells_body(
columns = BF,
row = BF > 10),
cells_body(
columns = T11_VLMR_PValue,
row = ifelse(T11_VLMR_PValue < .001 & lead(T11_VLMR_PValue) > .05, T11_VLMR_PValue < .001, NA)),
cells_body(
columns = BLRT_PValue,
row = ifelse(BLRT_PValue < .001 & lead(BLRT_PValue) > .05, BLRT_PValue < .001, NA))
)
)
fit_table
| Model Fit Summary Table1 | ||||||||||
| Classes | Par | LL | BIC | aBIC | CAIC | AWE | BLRT | VLMR | BF | cmPk |
|---|---|---|---|---|---|---|---|---|---|---|
| 1-Class | 5 | −8,150.35 | 16,340.16 | 16,324.27 | 16,345.16 | 16,394.62 | – | – | 0.00 | <.001 |
| 2-Class | 11 | −7,191.88 | 14,470.57 | 14,435.61 | 14,481.56 | 14,590.37 | <.001 | <.001 | 0.00 | <.001 |
| 3-Class | 17 | −7,124.92 | 14,384.00 | 14,329.99 | 14,401.00 | 14,569.16 | <.001 | <.001 | 0.00 | 0.00 |
| 4-Class | 23 | −7,095.12 | 14,371.76 | 14,298.68 | 14,394.76 | 14,622.26 | <.001 | <.001 | >100 | 1.00 |
| 5-Class | 29 | −7,091.95 | 14,412.75 | 14,320.61 | 14,441.75 | 14,728.61 | 0.67 | 0.42 | >100 | <.001 |
| 6-Class | 35 | −7,090.89 | 14,457.98 | 14,346.78 | 14,492.98 | 14,839.19 | 0.67 | 0.58 | – | <.001 |
| 1 Note. Par = Parameters; LL = model log likelihood; BIC = Bayesian information criterion; aBIC = sample size adjusted BIC; CAIC = consistent Akaike information criterion; AWE = approximate weight of evidence criterion; BLRT = bootstrapped likelihood ratio test p-value; VLMR = Vuong-Lo-Mendell-Rubin adjusted likelihood ratio test p-value; cmPk = approximate correct model probability. | ||||||||||
Save table:
gtsave(fit_table, here("figures","fit_table.png"))
allFit %>%
dplyr::select(2:7) %>%
rowid_to_column() %>%
pivot_longer(`BIC`:`AWE`,
names_to = "Index",
values_to = "ic_value") %>%
mutate(Index = factor(Index,
levels = c ("AWE", "CAIC", "BIC", "aBIC"))) %>%
ggplot(aes(
x = rowid,
y = ic_value,
color = Index,
shape = Index,
group = Index,
lty = Index
)) +
geom_point(size = 2.0) + geom_line(size = .8) +
scale_x_continuous(breaks = 1:nrow(allFit)) +
scale_colour_grey(end = .5) +
theme_cowplot() +
labs(x = "Number of Classes", y = "Information Criteria Value", title = "Information Criteria") +
theme(
text = element_text(family = "serif", size = 12),
legend.text = element_text(family="serif", size=12),
legend.key.width = unit(3, "line"),
legend.title = element_blank(),
legend.position = "top"
)
Save figure:
ggsave(here("figures", "info_criteria.png"), dpi=300, height=5, width=7, units="in")
Compare probability plots for \(K = 1:4\) class solutions
model_results <- data.frame()
for (i in 1:length(output_lsal)) {
temp <- output_lsal[[i]]$parameters$probability.scale %>%
mutate(model = paste0(i, "-Class Model"))
model_results <- rbind(model_results, temp)
}
compare_plot <-
model_results %>%
filter(category == 2) %>%
dplyr::select(est, model, LatentClass, param) %>%
filter(model != "5-Class Model" &
model != "6-Class Model") #Remove from plot
compare_plot$param <- fct_inorder(compare_plot$param)
ggplot(
compare_plot,
aes(
x = param,
y = est,
color = LatentClass,
shape = LatentClass,
group = LatentClass,
lty = LatentClass
)
) +
geom_point() +
geom_line() +
scale_colour_viridis_d() +
facet_wrap(~ model, ncol = 2) +
labs(title = "Science Attitude Items", x = " ", y = "Probability") +
theme_minimal() +
theme(panel.grid.major.y = element_blank(),
axis.text.x = element_text(angle = -45, hjust = -.1))
Save figure:
ggsave(here("figures", "compare_kclass_plot.png"), dpi=300, height=5, width=7, units="in")
Use the plot_lca function provided in the folder to plot
the item probability plot. This function requires one argument: -
model_name: The name of the Mplus readModels
object (e.g., output_lsal$c4_lsal.out)
source("plot_lca.txt")
plot_lca(model_name = output_lsal$c4_lsal.out)
Save figure:
ggsave(here("figures", "probability_plot.png"), dpi="retina", height=5, width=7, units="in")
Save response frequencies for the 4-class model with
response is _____.dat under SAVEDATA.
patterns <- mplusObject(
TITLE = "LCA - Save response patterns",
VARIABLE =
"categorical = enjoy useful logical job adult;
usevar = enjoy useful logical job adult;
classes = c(4);",
ANALYSIS =
"estimator = mlr;
type = mixture;
starts = 0;
processors = 10;
optseed = 939021;",
SAVEDATA =
"File=savedata.dat;
Save=cprob;
! Code to save response frequency data
response is resp_patterns.dat;",
OUTPUT = "patterns tech10 tech11 tech14",
usevariables = colnames(lsay_data),
rdata = lsay_data)
patterns_fit <- mplusModeler(patterns,
dataout=here("mplus", "patterns.dat"),
modelout=here("mplus", "patterns.inp") ,
check=TRUE, run = TRUE, hashfilename = FALSE)
Note: You may see an error that says
<simpleError in bivarFitData[mPos, ] <- c(vars, values): number of items to replace is not a multiple of replacement length>,
the developers are aware of this and are working to fix it.
Read in observed response pattern data and relabel the columns
# Read in response frequency data that we just created:
patterns <- read_table(here("mplus", "resp_patterns.dat"),
col_names=FALSE, na = "*")
# Extract the column names
names <- names(readModels(here("mplus", "patterns.out"))[['savedata']])
# Add the names back to the dataset
colnames(patterns) <- c("Frequency", names)
Create a table with the top 5 unconditional response pattern, then top of conditional response pattern for each modal class assignment
# Order responses by highest frequency
order_highest <- patterns %>%
arrange(desc(Frequency))
# Loop `patterns` data to list top 5 conditional response patterns for each class
loop_cond <- lapply(1:max(patterns$C), function(k) {
order_cond <- patterns %>%
filter(C == k) %>%
arrange(desc(Frequency)) %>%
head(5)
})
# Convert loop into data frame
table_data <- as.data.frame(bind_rows(loop_cond))
# Combine unconditional and conditional responses patterns
response_patterns <- rbind(order_highest[1:5,], table_data)
Finally, use {gt} to make a nicely formatted table
resp_table <- response_patterns %>%
gt() %>%
tab_header(
title = "Observed Response Patterns",
subtitle = html("Response patterns, estimated frequencies, estimated posterior class probabilities and modal assignments")) %>%
tab_source_note(
source_note = md("Data Source: **Longitudinal Study of Life (LSAL)**")) %>%
cols_label(
Frequency = html("<i>f</i><sub>r</sub>"),
ENJOY = "Enjoy",
USEFUL = "Useful",
LOGICAL = "Logical",
JOB = "Job",
ADULT = "Adult",
CPROB1 = html("P<sub><i>k</i></sub>=1"),
CPROB2 = html("P<sub><i>k</i></sub>=2"),
CPROB3 = html("P<sub><i>k</i></sub>=3"),
CPROB4 = html("P<sub><i>k</i></sub>=4"), # Change based on number of clases
C = md("*k*")) %>%
tab_row_group(
label = "Unconditional response patterns",
rows = 1:5) %>%
tab_row_group(
label = md("*k* = 1 Conditional response patterns"),
rows = 6:10) %>% #EDIT THESE VALUES BASED ON THE LAST COLUMN
tab_row_group(
label = md("*k* = 2 Conditional response patterns"),
rows = 11:15) %>% #EDIT THESE VALUES BASED ON THE LAST COLUMN
tab_row_group(
label = md("*k* = 3 Conditional response patterns"),
rows = 16:20) %>% #EDIT THESE VALUES BASED ON THE LAST COLUMN
tab_row_group(
label = md("*k* = 4 Conditional response patterns"),
rows = 21:25) %>% #EDIT THESE VALUES BASED ON THE LAST COLUMN
row_group_order(
groups = c("Unconditional response patterns",
md("*k* = 1 Conditional response patterns"),
md("*k* = 2 Conditional response patterns"),
md("*k* = 3 Conditional response patterns"),
md("*k* = 4 Conditional response patterns"))) %>%
tab_footnote(
footnote = html(
"<i>Note.</i> <i>f</i><sub>r</sub> = response pattern frequency; P<sub><i>k</i></sub> = posterior class probabilities"
)
) %>%
cols_align(align = "center") %>%
opt_align_table_header(align = "left") %>%
gt::tab_options(table.font.names = "Times New Roman")
resp_table
| Observed Response Patterns | ||||||||||
| Response patterns, estimated frequencies, estimated posterior class probabilities and modal assignments | ||||||||||
| fr | Enjoy | Useful | Logical | Job | Adult | Pk=1 | Pk=2 | Pk=3 | Pk=4 | k |
|---|---|---|---|---|---|---|---|---|---|---|
| Unconditional response patterns | ||||||||||
| 897 | 1 | 1 | 1 | 1 | 1 | 0.956 | 0.000 | 0.000 | 0.044 | 1 |
| 216 | 0 | 1 | 1 | 1 | 1 | 0.898 | 0.000 | 0.000 | 0.102 | 1 |
| 184 | 0 | 0 | 0 | 0 | 0 | 0.000 | 0.000 | 0.966 | 0.034 | 3 |
| 158 | 1 | 1 | 0 | 1 | 1 | 0.613 | 0.270 | 0.000 | 0.117 | 1 |
| 87 | 1 | 0 | 0 | 0 | 0 | 0.000 | 0.000 | 0.883 | 0.117 | 3 |
| k = 1 Conditional response patterns | ||||||||||
| 897 | 1 | 1 | 1 | 1 | 1 | 0.956 | 0.000 | 0.000 | 0.044 | 1 |
| 216 | 0 | 1 | 1 | 1 | 1 | 0.898 | 0.000 | 0.000 | 0.102 | 1 |
| 158 | 1 | 1 | 0 | 1 | 1 | 0.613 | 0.270 | 0.000 | 0.117 | 1 |
| 84 | 1 | 0 | 1 | 1 | 1 | 0.752 | 0.000 | 0.000 | 0.248 | 1 |
| 60 | 1 | 1 | 1 | 0 | 1 | 0.648 | 0.000 | 0.000 | 0.352 | 1 |
| k = 2 Conditional response patterns | ||||||||||
| 72 | 1 | 0 | 0 | 1 | 1 | 0.099 | 0.768 | 0.000 | 0.134 | 2 |
| 66 | 0 | 1 | 0 | 1 | 1 | 0.343 | 0.495 | 0.000 | 0.161 | 2 |
| 54 | 0 | 0 | 0 | 1 | 1 | 0.033 | 0.855 | 0.000 | 0.112 | 2 |
| 36 | 1 | 1 | 0 | 0 | 1 | 0.143 | 0.537 | 0.000 | 0.320 | 2 |
| 26 | 1 | 0 | 0 | 0 | 1 | 0.012 | 0.797 | 0.000 | 0.191 | 2 |
| k = 3 Conditional response patterns | ||||||||||
| 184 | 0 | 0 | 0 | 0 | 0 | 0.000 | 0.000 | 0.966 | 0.034 | 3 |
| 87 | 1 | 0 | 0 | 0 | 0 | 0.000 | 0.000 | 0.883 | 0.117 | 3 |
| 49 | 0 | 0 | 0 | 1 | 0 | 0.000 | 0.000 | 0.775 | 0.225 | 3 |
| 25 | 0 | 1 | 0 | 0 | 0 | 0.000 | 0.000 | 0.538 | 0.462 | 3 |
| 4 | 0 | NA | 0 | 0 | 0 | 0.000 | 0.000 | 0.915 | 0.085 | 3 |
| k = 4 Conditional response patterns | ||||||||||
| 67 | 1 | 1 | 1 | 1 | 0 | 0.000 | 0.000 | 0.001 | 0.999 | 4 |
| 48 | 0 | 1 | 1 | 1 | 0 | 0.000 | 0.000 | 0.003 | 0.997 | 4 |
| 45 | 1 | 0 | 1 | 1 | 0 | 0.000 | 0.000 | 0.018 | 0.982 | 4 |
| 41 | 1 | 1 | 1 | 0 | 0 | 0.000 | 0.000 | 0.006 | 0.994 | 4 |
| 36 | 1 | 0 | 0 | 1 | 0 | 0.000 | 0.000 | 0.477 | 0.523 | 4 |
| Data Source: Longitudinal Study of Life (LSAL) | ||||||||||
| Note. fr = response pattern frequency; Pk = posterior class probabilities | ||||||||||
Save table:
gtsave(resp_table, here("figures","resp_table.png"))
Use Mplus to calculate k-class confidence intervals (Note: Change the syntax to make your chosen k-class model):
classification <- mplusObject(
TITLE = "LCA - Calculated k-Class 95% CI",
VARIABLE =
"categorical = enjoy-adult;
usevar = enjoy-adult;
classes = c(4);",
ANALYSIS =
"estimator = ml;
type = mixture;
starts = 0;
processors = 10;
optseed = 939021;
bootstrap = 1000;",
MODEL =
"
!CHANGE THIS SECTION TO YOUR CHOSEN k-CLASS MODEL
%OVERALL%
[C#1](c1);
[C#2](c2);
[C#3](c3);
Model Constraint:
New(p1 p2 p3 p4);
p1 = exp(c1)/(1+exp(c1)+exp(c2)+exp(c3));
p2 = exp(c2)/(1+exp(c1)+exp(c2)+exp(c3));
p3 = exp(c3)/(1+exp(c1)+exp(c2)+exp(c3));
p4 = 1/(1+exp(c1)+exp(c2)+exp(c3));",
OUTPUT = "cinterval(bcbootstrap)",
usevariables = colnames(lsay_data),
rdata = lsay_data)
classification_fit <- mplusModeler(classification,
dataout=here("mplus", "lsal.dat"),
modelout=here("mplus", "class.inp") ,
check=TRUE, run = TRUE, hashfilename = FALSE)
Note: Ensure that the classes did not shift during this step (i.g., Class 1 in the enumeration run is now Class 4). Evaluate output and compare the class counts and proportions for the latent classes. Using the OPTSEED function ensures replication of the best loglikelihood value run.
Read in the 4-class model:
# Read in the 4-class model and extract information needed
class_output <- readModels(here("mplus", "class.out"))
# Entropy
entropy <- c(class_output$summaries$Entropy, rep(NA, class_output$summaries$NLatentClasses-1))
# 95% k-Class and k-class 95% Confidence Intervals
k_ci <- class_output$parameters$ci.unstandardized %>%
filter(paramHeader == "New.Additional.Parameters") %>%
unite(CI, c(low2.5,up2.5), sep=", ", remove = TRUE) %>%
mutate(CI = paste0("[", CI, "]")) %>%
rename(kclass=est) %>%
dplyr::select(kclass, CI)
# AvePPk = Average Latent Class Probabilities for Most Likely Latent Class Membership (Row) by Latent Class (Column)
avePPk <- tibble(avePPk = diag(class_output$class_counts$avgProbs.mostLikely))
# mcaPk = modal class assignment proportion
mcaPk <- round(class_output$class_counts$mostLikely,3) %>%
mutate(model = paste0("Class ", class)) %>%
add_column(avePPk, k_ci) %>%
rename(mcaPk = proportion) %>%
dplyr::select(model, kclass, CI, mcaPk, avePPk)
# OCCk = odds of correct classification
OCCk <- mcaPk %>%
mutate(OCCk = round((avePPk/(1-avePPk))/(kclass/(1-kclass)),3))
# Put everything together
class_df <- data.frame(OCCk, entropy)
Now, use {gt} to make a nicely formatted table
class_table <- class_df %>%
gt() %>%
tab_header(
title = "Model Classification Diagnostics for the 4-Class Solution") %>%
cols_label(
model = md("*k*-Class"),
kclass = md("*k*-Class Proportions"),
CI = "95% CI",
mcaPk = md("*mcaPk*"),
avePPk = md("*AvePPk*"),
OCCk = md("*OCCk*"),
entropy = "Entropy") %>%
sub_missing(7,
missing_text = "") %>%
cols_align(align = "center") %>%
opt_align_table_header(align = "left") %>%
gt::tab_options(table.font.names = "Times New Roman")
class_table
| Model Classification Diagnostics for the 4-Class Solution | ||||||
| k-Class | k-Class Proportions | 95% CI | mcaPk | AvePPk | OCCk | Entropy |
|---|---|---|---|---|---|---|
| Class 1 | 0.197 | [0.162, 0.244] | 0.198 | 0.819 | 18.444 | 0.739 |
| Class 2 | 0.106 | [0.086, 0.132] | 0.105 | 0.880 | 61.849 | |
| Class 3 | 0.477 | [0.426, 0.518] | 0.493 | 0.900 | 9.868 | |
| Class 4 | 0.221 | [0.182, 0.263] | 0.204 | 0.823 | 16.390 | |
Save table:
gtsave(class_table, here("figures","class_table.png"))